In [1]:
import numpy as np
import theano
import theano.tensor as T
import lasagne
import matplotlib.pyplot as plt
%matplotlib inline
import gzip
import pickle
In [2]:
# Seed for reproducibility
np.random.seed(42)
In [3]:
# Download the MNIST digits dataset
!wget -N http://deeplearning.net/data/mnist/mnist.pkl.gz
In [4]:
# Load the train, validation and test splits as numpy arrays
train, val, test = pickle.load(gzip.open('mnist.pkl.gz'))
X_train, y_train = train
X_val, y_val = val
In [5]:
# The original 28x28 pixel images are flattened into 784 dimensional feature vectors
X_train.shape
Out[5]:
(50000, 784)
In [6]:
# Plot the first few examples
plt.figure(figsize=(12,3))
for i in range(10):
    plt.subplot(1, 10, i+1)
    plt.imshow(X_train[i].reshape((28, 28)), cmap='gray', interpolation='nearest')
    plt.axis('off')
In [7]:
# For training, we want to sample examples at random in small batches
def batch_gen(X, y, N):
    while True:
        idx = np.random.choice(len(y), N)
        yield X[idx].astype('float32'), y[idx].astype('int32')
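Note that this generator samples indices with replacement, so an "epoch" below is only an approximate pass over the data. A minimal alternative (our own sketch, not used in the rest of the notebook) shuffles once per pass and yields each example exactly once:

# Hypothetical helper: shuffle once per pass, then walk the data in order
# so every example appears exactly once per pass.
def shuffled_batch_gen(X, y, N):
    while True:
        order = np.random.permutation(len(y))
        for start in range(0, len(y) - N + 1, N):
            idx = order[start:start + N]
            yield X[idx].astype('float32'), y[idx].astype('int32')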
In [8]:
# A very simple network, a single layer with one neuron per target class.
# Using the softmax activation function gives us a probability distribution at the output.
l_in = lasagne.layers.InputLayer((None, 784))
l_out = lasagne.layers.DenseLayer(
    l_in,
    num_units=10,
    nonlinearity=lasagne.nonlinearities.softmax)
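As a quick sanity check (our own addition), Lasagne can report the output shape and the number of trainable parameters of the stack we just built; with 784 inputs and 10 units we expect 784*10 weights plus 10 biases, i.e. 7850 parameters.

# Sanity check on the architecture defined above.
print(lasagne.layers.get_output_shape(l_out))  # (None, 10)
print(lasagne.layers.count_params(l_out))      # 7850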
In [9]:
# Symbolic variables for our input features and targets
X_sym = T.matrix()
y_sym = T.ivector()
In [10]:
# Theano expressions for the output distribution and predicted class
output = lasagne.layers.get_output(l_out, X_sym)
pred = output.argmax(-1)
In [11]:
# The loss is the cross-entropy averaged over a minibatch; we also compute accuracy as an evaluation metric
loss = T.mean(lasagne.objectives.categorical_crossentropy(output, y_sym))
acc = T.mean(T.eq(pred, y_sym))
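To make the loss concrete: for each example, categorical cross-entropy is minus the log of the probability the model assigned to the true class, averaged over the minibatch. A small NumPy illustration of that arithmetic (ours, for intuition only):

# Two toy examples with 3 classes each; the true classes are 0 and 2.
probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.1, 0.8]])
targets = np.array([0, 2])
print(-np.mean(np.log(probs[np.arange(2), targets])))  # about 0.290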
In [12]:
# We retrieve all the trainable parameters in our network - a single weight matrix and bias vector
params = lasagne.layers.get_all_params(l_out)
print(params)
In [13]:
# Compute the gradient of the loss function with respect to the parameters.
# The stochastic gradient descent algorithm produces updates for each param
grad = T.grad(loss, params)
updates = lasagne.updates.sgd(grad, params, learning_rate=0.05)
print(updates)
In [14]:
# We define a training function that will compute the loss and accuracy, and take a single optimization step
f_train = theano.function([X_sym, y_sym], [loss, acc], updates=updates)
In [15]:
# The validation function is similar, but does not update the parameters
f_val = theano.function([X_sym, y_sym], [loss, acc])
In [16]:
# The prediction function doesn't require targets, and outputs only the predicted class values
f_predict = theano.function([X_sym], pred)
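As a usage sketch, we can ask it for the classes of a few validation images and compare them with the true labels (at this point the network is untrained, so the predictions will look arbitrary):

# Predicted vs. true classes for the first 10 validation images.
print(f_predict(X_val[:10].astype('float32')))
print(y_val[:10])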
In [17]:
# We'll choose a batch size, and calculate the number of batches in an "epoch"
# (approximately one pass through the data).
BATCH_SIZE = 64
N_BATCHES = len(X_train) // BATCH_SIZE
N_VAL_BATCHES = len(X_val) // BATCH_SIZE
In [18]:
# Minibatch generators for the training and validation sets
train_batches = batch_gen(X_train, y_train, BATCH_SIZE)
val_batches = batch_gen(X_val, y_val, BATCH_SIZE)
In [19]:
# Try sampling from the batch generator.
# Plot an image and corresponding label to verify they match.
X, y = next(train_batches)
plt.imshow(X[0].reshape((28, 28)), cmap='gray', interpolation='nearest')
print(y[0])
In [20]:
# For each epoch, we call the training function N_BATCHES times,
# accumulating an estimate of the training loss and accuracy.
# Then we do the same thing for the validation set.
# Plotting the ratio of val to train loss can help recognize overfitting.
for epoch in range(10):
    train_loss = 0
    train_acc = 0
    for _ in range(N_BATCHES):
        X, y = next(train_batches)
        loss, acc = f_train(X, y)
        train_loss += loss
        train_acc += acc
    train_loss /= N_BATCHES
    train_acc /= N_BATCHES

    val_loss = 0
    val_acc = 0
    for _ in range(N_VAL_BATCHES):
        X, y = next(val_batches)
        loss, acc = f_val(X, y)
        val_loss += loss
        val_acc += acc
    val_loss /= N_VAL_BATCHES
    val_acc /= N_VAL_BATCHES

    print('Epoch {}, Train (val) loss {:.03f} ({:.03f}) ratio {:.03f}'.format(
        epoch, train_loss, val_loss, val_loss / train_loss))
    print('Train (val) accuracy {:.03f} ({:.03f})'.format(train_acc, val_acc))
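To actually plot the training and validation losses rather than just printing them, one option (a sketch of our own; it continues training the same network for a few more epochs) is to collect the per-epoch averages in lists and plot the two curves:

# Continue training briefly while recording per-epoch average losses,
# then plot both curves; a growing gap between them suggests overfitting.
train_losses, val_losses = [], []
for epoch in range(3):
    train_losses.append(np.mean([f_train(*next(train_batches))[0] for _ in range(N_BATCHES)]))
    val_losses.append(np.mean([f_val(*next(val_batches))[0] for _ in range(N_VAL_BATCHES)]))
plt.plot(train_losses, label='train')
plt.plot(val_losses, label='val')
plt.xlabel('epoch')
plt.ylabel('cross-entropy loss')
plt.legend()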
In [21]:
# We can retrieve the value of the trained weight matrix from the output layer.
# It can be interpreted as a collection of images, one per class
weights = l_out.W.get_value()
print(weights.shape)
In [22]:
# Plotting the weight images, we can recognize a rough template of each digit class
plt.figure(figsize=(12,3))
for i in range(10):
    plt.subplot(1, 10, i+1)
    plt.imshow(weights[:, i].reshape((28, 28)), cmap='gray', interpolation='nearest')
    plt.axis('off')
In [23]:
# Uncomment and execute this cell for an example solution
# %load spoilers/logreg.py
In [24]:
# Uncomment and execute this cell for an example solution
# %load spoilers/hiddenlayer.py
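Without reproducing the spoiler file, the general shape of a network with one hidden layer looks like this in Lasagne (our sketch; the layer size and nonlinearity are arbitrary choices, not necessarily what spoilers/hiddenlayer.py uses):

# Sketch of a one-hidden-layer network; the Theano functions would be
# rebuilt against l_out2 to train it.
l_in2 = lasagne.layers.InputLayer((None, 784))
l_hidden = lasagne.layers.DenseLayer(
    l_in2,
    num_units=256,
    nonlinearity=lasagne.nonlinearities.rectify)
l_out2 = lasagne.layers.DenseLayer(
    l_hidden,
    num_units=10,
    nonlinearity=lasagne.nonlinearities.softmax)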
Try one of the other algorithms available in lasagne.updates. You may also want to adjust the learning rate.
Visualize and compare the trained weights. Different optimization trajectories may lead to very different results, even if the performance is similar. This can be important when training more complicated networks.
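As one concrete possibility (a sketch of ours, not the contents of spoilers/optimizer.py): swap sgd for lasagne.updates.adam and compile a fresh training function. The Python names loss and acc were rebound to floats by the training loop above, so the symbolic expressions are redefined here. For a fair comparison you would also re-initialize the network's parameters before training again.

# Rebuild the symbolic loss/accuracy (the names were overwritten above)
# and use Adam instead of plain SGD.
loss_sym = T.mean(lasagne.objectives.categorical_crossentropy(output, y_sym))
acc_sym = T.mean(T.eq(pred, y_sym))
updates_adam = lasagne.updates.adam(loss_sym, params, learning_rate=0.001)
f_train_adam = theano.function([X_sym, y_sym], [loss_sym, acc_sym], updates=updates_adam)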
In [25]:
# Uncomment and execute this cell for an example solution
# %load spoilers/optimizer.py
In [ ]: